Data Science and visualization
R and dplyr

Etienne Côme

September 14, 2021

R

Rstudio

https://learnr-examples.shinyapps.io/ex-setup-r/

R & Rstudio install

Download and install R

https://cloud.r-project.org/

Download and install Rstudio

https://rstudio.com/products/rstudio/download/

Install packages

install.packages("dplyr")
install.packages("readr")
install.packages("tidyr")
install.packages("ggplot2")

Basic types

the vectors :

# numeric vector
a = c(1,5,10)
class(a)
## [1] "numeric"
# character vector 
b = c("a", "g", "t", "c", "g")
class(b)
## [1] "character"
  • stores elements of the same type
  • basic operations c, length, seq, rep, logical indexing
  • numbering starts at 1!

Basic types

vectors, basic manipulations :

length(a)
## [1] 3
a[1:2]
## [1] 1 5
i = 1:2;
a[i]
## [1] 1 5
i = (b=="g")
b[i]
## [1] "g" "g"

Basic types

vectors, basic manipulations :

# every second element of b (indices 1, 3, 5)
i = seq(1,length(b),2);b[i]
## [1] "a" "t" "g"
# the index 1 repeated 5 times
i = rep(1,5);b[i]
## [1] "a" "a" "a" "a" "a"
# the whole pattern (1,2) repeated 5 times
i = rep(c(1,2),5);b[i]
##  [1] "a" "g" "a" "g" "a" "g" "a" "g" "a" "g"
# each element of the pattern repeated 3 times
i = rep(c(1,2),each=3);b[i]
## [1] "a" "a" "a" "g" "g" "g"

Basic types, factors :

vectors, basic manipulations:

b = c("a", "g", "t", "c", "g")
c = factor(b,levels=c("a", "t", "g", "c"))
levels(c)
## [1] "a" "t" "g" "c"
unclass(c)
## [1] 1 3 2 4 3
## attr(,"levels")
## [1] "a" "t" "g" "c"
  • special type of vector for encoding categorical variables; the possible values are “the levels”.
  • basic operations c, length, levels, unclass
  • ! beware: before R 4.0, strings were converted to factors by default when creating a data.frame (stringsAsFactors)

Basic types

Matrix :

# numeric matrix
a = matrix(c(1,5,10,10),2,2)
# character matrices
b = rbind(c("a", "g"),c("t", "t"),c("c", "g"))
c = cbind(c("a", "g"),c("t", "t"),c("c", "g"))
  • stores elements of the same type
  • basic operations dim, rbind, cbind, logical indexing

Basic types

Matrix :

dim(b)
## [1] 3 2
t(b)
##      [,1] [,2] [,3]
## [1,] "a"  "t"  "c" 
## [2,] "g"  "t"  "g"
dim(t(b))
## [1] 2 3
a[1,]
## [1]  1 10
b[,2]
## [1] "g" "t" "g"
c[c[,1]=="a",]
## [1] "a" "t" "c"
  • stores elements of the same type
  • basic operations dim, rbind, cbind, logical indexing

Basic types

The arrays :

# Tensor 3 dimensions
a = array(runif(50),dim=c(5,5,2))
a[1,,]
a[,5,]
a[,2,1]
  • stores elements of the same type
  • basic operations dim, logical indexing

Basic types

Lists :
l = list(a,b,c)
length(l)
## [1] 3
l[[2]]
##      [,1] [,2]
## [1,] "a"  "g" 
## [2,] "t"  "t" 
## [3,] "c"  "g"
l = list(a=a,b=b,c=c)
  • stores elements of different types
  • basic operations length, c

Basic types

Lists :
l$c
##      [,1] [,2] [,3]
## [1,] "a"  "t"  "c" 
## [2,] "g"  "t"  "g"
l[[2]]
##      [,1] [,2]
## [1,] "a"  "g" 
## [2,] "t"  "t" 
## [3,] "c"  "g"
  • stores elements of different types
  • basic operations length, c

Basic types

The data.frame :

d = data.frame(v1=rep("a",10),v2=1:10,v3=runif(10))
dim(d)
d$v1
d$v4 = factor(rep(c("a", "b"),5),levels=c("a", "b"))
d[d$v4=="a",]
d[, "v2"]
d[,c(3,1)]
d[,c("v2", "v4")]
names(d)
summary(d)
  • stores elements of different types
  • a list of named vectors that can be indexed and manipulated like a matrix
  • basic operations dim, cbind, rbind, names, summary

Functions

# f returns a - b, using an explicit return()
f = function(a,b){
  return(a-b)
}
# positional arguments: a=5, b=6, result -1
f(5,6)
# named arguments can be given in any order: a=6, b=5, result 1
f(b=5,a=6)
# redefinition with default values; the value of the last expression is returned
f = function(a=32,b=12){
  a-b
}
# no arguments: the defaults are used, 32-12 gives 20
f()
# explicit arguments override the defaults: -1, then 1
f(5,6)
f(b=5,a=6)
  • named argument and default value
  • no need for explicit return

Read data


# base R readers
data = read.table("filename")
data = read.csv("filename")
# faster readers from the readr package
library(readr)
data = read_csv("filename")
data = read_delim("filename")

Loops and flow control

# seq_along(a) is safe when a is empty (1:length(a) would iterate over 1, 0)
for (i in seq_along(a)){}
while(i > 4){i=i-1}

! avoid for loops (use vectorized operations)

# element-by-element loop, timed for comparison with the vectorized version
a=runif(100000)
t=Sys.time()
# seq_along(a) is safe when a is empty (1:length(a) would iterate over 1, 0)
for (i in seq_along(a)){a[i]=a[i]+5}
t1=Sys.time()-t
t1
## Time difference of 0.01478052 secs

Loops and flow control

# seq_along(a) is safe when a is empty (1:length(a) would iterate over 1, 0)
for (i in seq_along(a)){}
while(i > 4){i=i-1}

! avoid for loops (use vectorized operations)

Vectorized version

# vectorized: add 5 to every element in a single operation
t=Sys.time()
a=a+5
t2=Sys.time()-t
t2
## Time difference of 0.002064466 secs
# speed-up factor; both durations are forced to seconds because a difftime
# chooses its units automatically and the two could otherwise differ
as.numeric(t1, units="secs")/as.numeric(t2, units="secs")
## [1] 7.159487

Some vector functions

sum, cumulative sum (cumsum), finite differences (diff), max, min …

a=data.frame(v1=runif(5000),v2=rnorm(5000),v3=rbinom(5000,5,0.2))
# basic algebraic operation
a$v1+a$v2;a$v1*a$v2;a$v1/a$v2

# matrix product
t(a$v1)%*%a$v2

# sum and cumulative sum
sum(a$v2);cumsum(a$v1)

# difference
diff(a$v2)

Some vector functions

sum, cumulative sum (cumsum), finite differences (diff), max, min …

# max, min ....
max(a$v3)
which.max(a$v1)
which(a$v1>0.2)

# string concatenation
paste(a$v1,a$v2);paste0(a$v1,a$v2)

# these also work on matrices
b=matrix(runif(100),10,10)
sum(b);rowSums(b);colSums(b)

Apply, lapply, sapply

Apply a function to each element of an object

a=data.frame(v1=runif(5000),v2=rnorm(5000),v3=rbinom(5000,5,0.2))
# apply to each row
r=apply(a,1,sum)
head(r);class(r);dim(r)

# apply to each column
r=apply(a,2,function(col){c(max(col),which.max(col))})
r;class(r);dim(r)

# apply to all elements of a list
b=list(v1=runif(5000),v2=rnorm(5000),v3=rbinom(5000,5,0.2))
r=lapply(b,which.max)
r;class(r)
# simplification of the result
r=sapply(b,which.max)
r;class(r)

better than loops…

Subset: sample, logical indexing

Select a part of the data

#logical indexing
a[a$v1>0.98 & a$v3==3,]
##             v1          v2 v3
## 806  0.9963570 -0.93142049  3
## 2301 0.9959883 -0.05808224  3
## 4597 0.9905205 -0.80389022  3
# using the subset function
subset(a,v1>0.98 & v3==3)
##             v1          v2 v3
## 806  0.9963570 -0.93142049  3
## 2301 0.9959883 -0.05808224  3
## 4597 0.9905205 -0.80389022  3

Binning: cut

Preprocess variables to build factors from intervals

# breaks listed in increasing order, matching the printed levels
r=cut(a$v2,c(-Inf,-3,-2,1,2,Inf))
class(r);head(r)
## [1] "factor"
## [1] (-2,1] (-2,1] (-2,1] (1,2]  (-2,1] (-2,1]
## Levels: (-Inf,-3] (-3,-2] (-2,1] (1,2] (2, Inf]

Match, %in%, setdiff, intersect

a = 10:100
b = 50:110
setdiff(a,b)
##  [1] 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
## [26] 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
intersect(a,b)
##  [1]  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68
## [20]  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87
## [39]  88  89  90  91  92  93  94  95  96  97  98  99 100

Match, %in%, setdiff, intersect

a = 10:100
b = 50:110
a %in% b
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [49]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [61]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [73]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [85]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
match(a,b)
##  [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
## [26] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA  1  2  3  4  5  6  7  8  9 10
## [51] 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
## [76] 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51

Counting: table

data = data.frame(v1=rep(c("a","t","g","c"),500/4),v2=rbinom(500,10,0.4))
table(data$v1)
## 
##   a   c   g   t 
## 125 125 125 125
table(data[,c('v1', 'v2')])
##    v2
## v1   0  1  2  3  4  5  6  7  8  9
##   a  0  1 17 31 27 28 14  6  1  0
##   c  3  6 16 21 33 22 17  4  3  0
##   g  0  8 21 26 31 23 12  3  1  0
##   t  2  2 13 27 23 31 19  7  0  1

dplyr,tidyr

2 libraries for easy data manipulation (Cheatsheet)
! Introduction of a new operator !

Sequence of operations, introduction of the pipe operator :

%>%

 x%>% f(y) =f(x,y) 
 x%>% f(y) %>% g(z) = g(f(x,y),z) 

Easy to write, Easy to read

dplyr,tidyr

Row selection “filter”

data %>% filter(condition)
data %>% distinct(v1)
data %>% sample_n(15,replace=FALSE)
data %>% sample_frac(0.2)
data %>% top_n(5,v1)
data %>% slice(20:30)

dplyr,tidyr

Column selection “select”

data %>% select(v1,v2)
data %>% select(contains('var'))
data %>% select(-v3)
data %>% pull(v3)

dplyr,tidyr

Transformation “mutate”

data %>% mutate(v3=v1/v2)

data %>% rename(v4=v1)

data %>% arrange(v4)
data %>% arrange(desc(v4))

dplyr,tidyr

Aggregation “summarize”

data %>% summarize(v1m=mean(v1))

With grouped data “group_by”

data %>% group_by(group) %>% summarise(v1m=mean(v1))
data %>% group_by(group) %>% summarise(v1med=median(v1))
Aggregation functions: mean, median, n, sum, max, min, …

Shortcut for counting:

# number of rows for each value of v4
data %>% group_by(v4) %>% summarize(n=n())
# count() is the shortcut for the group_by() + summarize(n=n()) pattern above
data %>% count(v4)

dplyr,tidyr

vector functions

 
# running total of v1
data1 %>% mutate(v2=cumsum(v1))
# two-way vectorized condition
data1 %>% mutate(v2=if_else(v1==32, "a", "b"))
# multi-way condition; TRUE ~ "c" is the fallback when no earlier condition matches
data1 %>% mutate(v2=case_when(v1==32 ~ "a",v1==33 & v4<5 ~"b", TRUE ~ "c"))

offset

 
data1 %>% mutate(v2=lag(v1))
data1 %>% mutate(v2=lead(v4))

! use after a group_by to apply the mutate within each group.

dplyr,tidyr

Join tables : “X_join”

 
data1 %>% left_join(data2, by=c("v1"="v2"))
data1 %>% right_join(data2)
data1 %>% inner_join(data2)
data1 %>% full_join(data2)

dplyr,tidyr

long format

library(dplyr)
library(tidyr)
df=expand_grid(year=2015:2020,
               countries=c("France", "Italy", "Morocco"))
df$value=runif(nrow(df))
df[1:3,]
## # A tibble: 3 × 3
##    year countries value
##   <int> <chr>     <dbl>
## 1  2015 France    0.746
## 2  2015 Italy     0.542
## 3  2015 Morocco   0.751

tidyr

  • tidyr::pivot_longer: wide format -> long format
  • tidyr::pivot_wider: long format -> wide format
  • tidyr::separate: split one column into several columns
  • tidyr::unite: concatenation of columns

dplyr,tidyr

long format -> wide format

dflarge = df %>% pivot_wider(names_from = year,values_from = value)
dflarge
## # A tibble: 3 × 7
##   countries `2015` `2016` `2017` `2018` `2019` `2020`
##   <chr>      <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 France     0.746 0.436   0.574  0.775  0.419  0.984
## 2 Italy      0.542 0.0340  0.786  0.933  0.263  0.813
## 3 Morocco    0.751 0.106   0.405  0.686  0.819  0.349

dplyr,tidyr

Wide format -> long format

dflong = dflarge %>% pivot_longer(cols = -1,values_to = "value",names_to = "year")
dflong
## # A tibble: 18 × 3
##    countries year   value
##    <chr>     <chr>  <dbl>
##  1 France    2015  0.746 
##  2 France    2016  0.436 
##  3 France    2017  0.574 
##  4 France    2018  0.775 
##  5 France    2019  0.419 
##  6 France    2020  0.984 
##  7 Italy     2015  0.542 
##  8 Italy     2016  0.0340
##  9 Italy     2017  0.786 
## 10 Italy     2018  0.933 
## 11 Italy     2019  0.263 
## 12 Italy     2020  0.813 
## 13 Morocco   2015  0.751 
## 14 Morocco   2016  0.106 
## 15 Morocco   2017  0.405 
## 16 Morocco   2018  0.686 
## 17 Morocco   2019  0.819 
## 18 Morocco   2020  0.349

More

https://r4ds.had.co.nz/

dplyr,tidyr, exercises

Exercise 1, 2 and 3 of GoT: https://comeetie.github.io/got/got_tp.html

dplyr,tidyr, extra exercise

Baby names

Make a map showing, for each French department, the male first name most frequently given to children born in 2005. The data to use are available in the data directory:

dplyr,tidyr, exercise